import pandas as pd
import numpy as np
from matplotlib.pyplot import *
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from datetime import datetime
from folium import Map, CircleMarker,Marker
from folium.plugins import MarkerCluster # for clustering the markers
#from google.colab import files
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.utils import all_estimators
import json
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, ShuffleSplit
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ARDRegression
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
# Google Colab variant (disabled): upload the CSV and read it from /content.
#csv = files.upload()
#df = pd.read_csv('/content/train.csv')
# Load the training data from a local, machine-specific absolute path.
df = pd.read_csv('C:/Users/anais/Documents/Esilv/S7/Stage/Challenge/train.csv')
# Preview the first rows.
df.head()
# Count the missing values in each column.
df.isna().sum()
# Il n'y a aucune valeur manquante.
# Number of distinct values in each column.
df.nunique()
# Summary statistics of the numeric columns.
df.describe()
# We can drop the column that we are not going to use.
df.pop('store_and_fwd_flag')
df.head()
# Scatter plot of trip duration (x axis) against the row index (y axis).
plt.scatter(df.trip_duration,df.index,color="blue")
plt.xlabel("Trip Duration")
# NOTE(review): the trailing semicolon is presumably a notebook idiom to hide
# the echoed return value of the last expression of a cell.
plt.title("Trip Duration for each Taxi ride");
# Longitude / latitude bounding box; only trips whose pickup AND dropoff
# points lie strictly inside it are kept (bounds are exclusive).
xlim = [-74.03, -73.77]
ylim = [40.63, 40.85]
df = df.loc[
    df.pickup_longitude.gt(xlim[0]) & df.pickup_longitude.lt(xlim[1])
    & df.dropoff_longitude.gt(xlim[0]) & df.dropoff_longitude.lt(xlim[1])
    & df.pickup_latitude.gt(ylim[0]) & df.pickup_latitude.lt(ylim[1])
    & df.dropoff_latitude.gt(ylim[0]) & df.dropoff_latitude.lt(ylim[1])
]
# Bar chart of the number of rides observed for each passenger count.
df.passenger_count.value_counts().plot(kind='bar', color=["violet"])
plt.title("Passengers in a group of")
plt.xticks(rotation='horizontal')
plt.xlabel("Number of Passengers")
plt.ylabel("Count for each passenger");
df.passenger_count.value_counts()
# The original note reports 53 rides without any passenger (figure not
# verifiable from the code alone); those rides are removed here.
df = df.loc[df.passenger_count.ne(0)]
# Re-display the counts to confirm the zero-passenger rides are gone.
df.passenger_count.value_counts()
# Nous allons transformer les variables pickup_datetime et dropoff_datetime en dates, sinon nous ne pourrons pas les comparer.
# Show the dtype of every column (the datetime columns are only converted
# with pd.to_datetime further down in the script).
df.dtypes
def trips_ongoing(excel_filepath, datetime):
    """Return the trips that were in progress at a given instant.

    A trip is considered in progress when its pickup happened strictly
    before ``datetime`` and its dropoff strictly after it.

    Args:
        excel_filepath: Path of the file to load. Despite the name, the file
            is read with ``pd.read_csv`` and must contain the columns
            ``pickup_datetime`` and ``dropoff_datetime``.
        datetime: Instant to test; any value comparable with a pandas
            datetime column (e.g. ``datetime.datetime`` or ``pd.Timestamp``).
            NOTE: this parameter shadows the ``datetime`` class inside the
            function; the name is kept so existing callers keep working.

    Returns:
        A DataFrame with the rows of the file whose trip spans ``datetime``;
        both datetime columns are converted to pandas datetimes.
    """
    trips = pd.read_csv(excel_filepath)
    # ``astype("datetime64")`` without a unit is rejected by pandas >= 2.0,
    # so the columns are converted explicitly instead.
    for column in ("pickup_datetime", "dropoff_datetime"):
        trips[column] = pd.to_datetime(trips[column])
    started_before = trips["pickup_datetime"] < datetime
    still_running = trips["dropoff_datetime"] > datetime
    return trips[started_before & still_running]
def visualize_trips(df):
    """Plot the pickup and dropoff points of trips on a map of New York.

    Trips with a pickup or dropoff point outside the New York bounding box
    are discarded. Every remaining trip contributes two circles (its pickup
    point and its dropoff point); circles are blue when the trip carried
    passengers and crimson when ``passenger_count`` is 0.

    Args:
        df: DataFrame with the columns ``pickup_longitude``,
            ``pickup_latitude``, ``dropoff_longitude``, ``dropoff_latitude``
            and ``passenger_count``.

    Returns:
        The folium ``Map`` holding one ``CircleMarker`` per plotted point.
    """
    # Coordinates of New York (exclusive bounds).
    xlim = [-74.03, -73.77]
    ylim = [40.63, 40.85]
    # Drop the trips that start or end too far away from New York.
    inside = (
        (df.pickup_longitude > xlim[0]) & (df.pickup_longitude < xlim[1])
        & (df.dropoff_longitude > xlim[0]) & (df.dropoff_longitude < xlim[1])
        & (df.pickup_latitude > ylim[0]) & (df.pickup_latitude < ylim[1])
        & (df.dropoff_latitude > ylim[0]) & (df.dropoff_latitude < ylim[1])
    )
    df = df[inside]

    m = Map(location=[40.767937, -73.982155], zoom_start=13)

    # All pickups are drawn first, then all dropoffs. Each endpoint is zipped
    # with the passenger count of ITS OWN trip; the previous implementation
    # interleaved the counts and therefore coloured points with the count of
    # an unrelated trip.
    endpoints = (
        ("Pickup", df.pickup_latitude, df.pickup_longitude),
        ("Dropoff", df.dropoff_latitude, df.dropoff_longitude),
    )
    for label, latitudes, longitudes in endpoints:
        for lat, lon, passengers in zip(latitudes, longitudes,
                                        df.passenger_count):
            # Trips without any passenger are highlighted in red.
            col = "#3186cc" if passengers != 0 else "crimson"
            CircleMarker(
                location=[lat, lon],
                radius=8,
                # Replaces a placeholder popup copied from an unrelated
                # example ("Laurelhurst Park").
                popup=f"{label} - {passengers} passenger(s)",
                color=col,
                fill=True,
                fill_color=col,
            ).add_to(m)
    return m
# Trips in progress on 2016-06-07 at 11:00, re-read from the local training CSV.
df_test = trips_ongoing('C:/Users/anais/Documents/Esilv/S7/Stage/Challenge/train.csv', datetime(2016,6,7,11))
# Build the map of those trips.
m = visualize_trips(df_test)
# NOTE(review): bare expression, presumably there to render the map when the
# code runs as a notebook cell.
m
# Since all the data dates from 2016, we will look at the number of trips per
# month: parse both datetime columns and split each one into its components
# (<prefix>_year, _month, _day, _hour, _min, _sec).
for _prefix in ("pickup", "dropoff"):
    df[f"{_prefix}_datetime"] = pd.to_datetime(df[f"{_prefix}_datetime"])
    for _suffix, _component in (
        ("year", "year"),
        ("month", "month"),
        ("day", "day"),
        ("hour", "hour"),
        ("min", "minute"),
        ("sec", "second"),
    ):
        df[f"{_prefix}_{_suffix}"] = getattr(
            df[f"{_prefix}_datetime"].dt, _component
        )
def NbTripsByMonth(df, year):
    """Draw a bar chart of the number of trips for each pickup month.

    Args:
        df: DataFrame with a ``pickup_month`` column.
        year: Value inserted in the chart title only; ``df`` is NOT filtered
            by year.

    Returns:
        None. The chart is drawn on a new matplotlib figure.
    """
    plt.figure(figsize=(15, 6))
    # value_counts() orders by decreasing frequency; sort_index() restores
    # the calendar order so the x axis reads chronologically.
    trips_per_month = df.pickup_month.value_counts().sort_index()
    trips_per_month.plot(kind='bar', color=["violet"], align='center',
                         width=0.3)
    plt.xticks(rotation='horizontal')
    plt.xlabel("Months")
    plt.ylabel("Number of trips")
    plt.title(f"Number of trips by month in {year}")
# Plot the number of trips per month; 2016 is used for the chart title.
NbTripsByMonth(df,2016)
def TripsPerMonth(dfo, month):
    """Draw a bar chart of the number of trips per month or per day.

    Args:
        dfo: DataFrame with a datetime ``pickup_datetime`` column plus the
            ``pickup_month`` and ``pickup_day`` columns.
        month: ``0`` to count the trips of each month over the whole
            DataFrame, or ``1``-``12`` to count the trips of each day of
            that month.

    Returns:
        None. The chart is drawn on a new matplotlib figure.

    Raises:
        ValueError: If ``month`` is not between 0 and 12.
    """
    # Label shown in the title; key 0 stands for "the whole year".
    month_labels = {
        0: "2016",
        1: "Janvier",
        2: "Février",
        3: "Mars",
        4: "Avril",
        5: "Mai",
        6: "Juin",
        7: "Juillet",
        8: "Aout",
        9: "Septembre",
        10: "Octobre",
        11: "Novembre",
        12: "Décembre",
    }
    if month not in month_labels:
        # Fail early with a clear message instead of producing an untitled,
        # empty selection further down.
        raise ValueError(f"month must be between 0 and 12, got {month!r}")
    month_str = month_labels[month]

    if month == 0:
        # Whole data set: one bar per pickup month.
        values = dfo.pickup_month
        xlabel_str = "Mois"
        title_str = f"Nombre de voyage par mois en {month_str}"
    else:
        # Single month: one bar per day of that month.
        values = dfo[dfo.pickup_datetime.dt.month == month].pickup_day
        xlabel_str = "Jour"
        title_str = f"Nombre de voyage par jour en {month_str}"

    plt.figure(figsize=(15, 6))
    # sort_index() keeps the x axis chronological; value_counts() alone
    # orders the bars by decreasing frequency.
    values.value_counts().sort_index().plot(
        kind='bar', color=["violet"], align='center', width=0.3)
    plt.xticks(rotation='horizontal')
    plt.xlabel(xlabel_str)
    plt.ylabel("Nombre de voyage")
    plt.title(title_str)
# Plot the number of trips for each day of January (month=1).
TripsPerMonth(df,1)
def ClustersPerMonth(dfo, month=0, week=0, day=0, weekday=-1, hour=-1):
    """Build a folium map with clustered markers for the selected trips.

    The trips are filtered on their pickup datetime; every filter left at
    its default value is skipped. One marker is added for each pickup point
    and each dropoff point of the selected trips, grouped through a folium
    ``MarkerCluster`` (no k-means clustering is performed).

    Args:
        dfo: DataFrame with a datetime ``pickup_datetime`` column and the
            ``pickup_longitude``, ``pickup_latitude``, ``dropoff_longitude``
            and ``dropoff_latitude`` columns.
        month: Month to keep (1-12); ``0`` disables the filter.
        week: ISO week number to keep (1-52); ``0`` disables the filter.
        day: Day of the month to keep (1-31); ``0`` disables the filter.
        weekday: Day of the week to keep (0-6); ``-1`` disables the filter.
        hour: Hour of the day to keep (0-23); ``-1`` disables the filter.

    Returns:
        The folium ``Map`` containing the marker cluster.
    """
    df = dfo
    # Select the wanted month (from 1 to 12).
    if month != 0:
        df = df[df.pickup_datetime.dt.month == month]
    # Select the wanted week (from 1 to 52).
    if week != 0:
        df = df[df.pickup_datetime.dt.isocalendar().week == week]
    # Select the wanted day (from 1 to 31).
    if day != 0:
        df = df[df.pickup_datetime.dt.day == day]
    # Select the wanted weekday (from 0 to 6).
    if weekday != -1:
        df = df[df.pickup_datetime.dt.weekday == weekday]
    # Select the wanted hour (from 0 to 23).
    if hour != -1:
        df = df[df.pickup_datetime.dt.hour == hour]

    # If nothing matches the requested date, fall back to the whole data set.
    if len(df) == 0:
        print("La date entrée n'existe pas ou aucun trajet n'a été effectué à cette date.")
        df = dfo
    print('les données correspondantes ont été sélectionnées')
    print(f'{len(df)} voyages correspondent')

    # Pickup points first, then dropoff points.
    longitude = list(df.pickup_longitude) + list(df.dropoff_longitude)
    latitude = list(df.pickup_latitude) + list(df.dropoff_latitude)
    total = len(longitude)

    m = Map(location=[40.767937, -73.982155], zoom_start=12)
    print("la map a été générée")
    marker_cluster = MarkerCluster().add_to(m)  # create marker clusters
    print("les clusters sont en cours d'ajout...")
    for coo, (lat, lon) in enumerate(zip(latitude, longitude)):
        # Progress report every 1000 markers.
        if coo % 1000 == 0:
            print(f'[{coo}/{total}] markers ajoutés')
        location = [lat, lon]
        tooltip = "Coordinates : {} <br>".format(location)
        Marker(location, tooltip=tooltip).add_to(marker_cluster)
    print("Tous les markers ont été ajoutés !")
    return m
# Cluster map of the trips picked up in January during hour 11.
m = ClustersPerMonth(df,month=1,hour = 11)
# Write the map as an HTML file in the current working directory.
m.save("./map.html")
# NOTE(review): bare expression, presumably there to render the map when the
# code runs as a notebook cell.
m